knitr::opts_chunk$set(echo = TRUE)
# install.packages("Hmisc")
# install.packages("pastecs")
# install.packages("ggplot2")
# install.packages("Hmisc")
# install.packages("fastDummies")
# install.packages("lmtest")
# install.packages("lmtest")
# install.packages("caretEnsemble")
# install.packages("Amelia")
# install.packages("GGally")
library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(corrplot)
## corrplot 0.92 loaded
library(caret)
## Loading required package: lattice
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
library(leaps)
library(reshape2)
library(gridExtra)
library(fastDummies)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(pastecs)
library(skimr)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.6      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.0 
## ✔ readr   2.1.2      ✔ forcats 0.5.1 
## ✔ purrr   0.3.4      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%()     masks ggplot2::%+%()
## ✖ psych::alpha()   masks ggplot2::alpha()
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ tidyr::expand()  masks Matrix::expand()
## ✖ tidyr::extract() masks pastecs::extract()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::first()   masks pastecs::first()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ dplyr::last()    masks pastecs::last()
## ✖ purrr::lift()    masks caret::lift()
## ✖ tidyr::pack()    masks Matrix::pack()
## ✖ tidyr::unpack()  masks Matrix::unpack()
library(caret)
library(caretEnsemble)
## 
## Attaching package: 'caretEnsemble'
## 
## The following object is masked from 'package:ggplot2':
## 
##     autoplot
library(psych)
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mice)
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(rpart)
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:gridExtra':
## 
##     combine
## 
## The following object is masked from 'package:psych':
## 
##     outlier
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(nnet)
library(ROCR)
library(Metrics)
## 
## Attaching package: 'Metrics'
## 
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library(caret)
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## 
## Attaching package: 'forecast'
## 
## The following object is masked from 'package:Metrics':
## 
##     accuracy
## 
## The following object is masked from 'package:caretEnsemble':
## 
##     autoplot
library(rpart)
library(rattle)
## Loading required package: bitops
## 
## Attaching package: 'bitops'
## 
## The following object is masked from 'package:Matrix':
## 
##     %&%
## 
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## 
## Attaching package: 'rattle'
## 
## The following object is masked from 'package:randomForest':
## 
##     importance
library(ggplot2)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:purrr':
## 
##     compact
library(rlist)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following object is masked from 'package:Metrics':
## 
##     auc
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ROSE)
## Loaded ROSE 0.0-4
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(rattle)
library(rpart.plot)
library(RColorBrewer)
# reading file
df = readr::read_csv("caravan-insurance-challenge.csv")
## Rows: 9822 Columns: 87
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): ORIGIN
## dbl (86): MOSTYPE, MAANTHUI, MGEMOMV, MGEMLEEF, MOSHOOFD, MGODRK, MGODPR, MG...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Step 1 Data Visulation and exploration

dimension = dim(df)
# total number of observations are 9822
# total number of variables are 87 

totalRows = dimension[1]

# data type of 86 variables is number and 1 variable is char, our target variable is number of policies bought remaining variables are categorical numeric.

head(df)
## # A tibble: 6 × 87
##   ORIGIN MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE
##   <chr>    <dbl>    <dbl>   <dbl>    <dbl>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 train       33        1       3        2        8      0      5      1      3
## 2 train       37        1       2        2        8      1      4      1      4
## 3 train       37        1       2        2        8      0      4      2      4
## 4 train        9        1       3        3        3      2      3      2      4
## 5 train       40        1       4        2       10      1      4      1      4
## 6 train       23        1       2        1        5      0      5      0      5
## # … with 77 more variables: MRELGE <dbl>, MRELSA <dbl>, MRELOV <dbl>,
## #   MFALLEEN <dbl>, MFGEKIND <dbl>, MFWEKIND <dbl>, MOPLHOOG <dbl>,
## #   MOPLMIDD <dbl>, MOPLLAAG <dbl>, MBERHOOG <dbl>, MBERZELF <dbl>,
## #   MBERBOER <dbl>, MBERMIDD <dbl>, MBERARBG <dbl>, MBERARBO <dbl>, MSKA <dbl>,
## #   MSKB1 <dbl>, MSKB2 <dbl>, MSKC <dbl>, MSKD <dbl>, MHHUUR <dbl>,
## #   MHKOOP <dbl>, MAUT1 <dbl>, MAUT2 <dbl>, MAUT0 <dbl>, MZFONDS <dbl>,
## #   MZPART <dbl>, MINKM30 <dbl>, MINK3045 <dbl>, MINK4575 <dbl>, …
# explore top 5 rows

Descriptive analysis

# summary of main data set
# stat.desc(df)
# summary(df)

# missing values in main data set
paste0("Total missing values:", sum(is.na(df)))
## [1] "Total missing values:0"
desc = skim(df)

Plot graphs

unInsuredRows = df[df$CARAVAN == 0,]
insuredRows = df[df$CARAVAN == 1,]

peopleNotInsuredWithCaravan  = nrow(unInsuredRows);
peopleInsuredWithCaravan = nrow(insuredRows);


ratioOfPeopleWhoDontBought = peopleNotInsuredWithCaravan/totalRows * 100
ratioOfPeopleWhoBought = peopleInsuredWithCaravan/totalRows * 100
# 5.96% who bought insurance vs 94% who didn't buy the insurance, by statistics it looks like company is in loss people are not interested in buying they have to something to improve.

dat <- data.frame(
  policy_status = factor(c("Not Insured","Insured"), levels=c("Not Insured","Insured")),
  Count = c( peopleNotInsuredWithCaravan , peopleInsuredWithCaravan)
)

ggplot(data=dat, aes(x=policy_status, y=Count, fill=policy_status)) +
  geom_bar(colour="red", stat="identity")

Customer Main Type

ggplot(df,aes(factor(df$MOSHOOFD))) + 
    geom_bar(aes(fill = factor(df$CARAVAN))) + 
    labs(x="Customer Main type") +
    scale_fill_discrete(name = "CARAVAN") + 
    ggtitle("Caravan Policy based on Customer Main Type") +
    theme(plot.title = element_text(hjust = 0.5))

df$maintype = df$MOSHOOFD

nrow(df[df$maintype == 1 & df$CARAVAN == 1,])
## [1] 75
nrow(df[df$maintype == 2 & df$CARAVAN == 1,])
## [1] 103
nrow(df[df$maintype == 3 & df$CARAVAN == 1,])
## [1] 109
nrow(df[df$maintype == 4 & df$CARAVAN == 1,])
## [1] 0
nrow(df[df$maintype == 5 & df$CARAVAN == 1,])
## [1] 18
nrow(df[df$maintype == 6 & df$CARAVAN == 1,])
## [1] 9
nrow(df[df$maintype == 7 & df$CARAVAN == 1,])
## [1] 35
nrow(df[df$maintype == 8 & df$CARAVAN == 1,])
## [1] 151
nrow(df[df$maintype == 9 & df$CARAVAN == 1,])
## [1] 75
nrow(df[df$maintype == 10 & df$CARAVAN == 1,])
## [1] 11
# Here we wanted to see the which customer main type has the highest frequency/count of buying the insurance. Based on results, we see that there are atleast 4 main customer categories that buy insurance. However, for our ease and understanding purposes we will only consider the top 2. This brings us to select, category number 8 and 3, where 8 = Family with grown ups and 3 = Driven Growths

Customer sub type

ggplot(df,aes(factor(df$MOSTYPE))) + 
    geom_bar(aes(fill = factor(df$CARAVAN))) + 
    labs(x="Customer Sub type") +
    scale_fill_discrete(name = "CARAVAN") + 
    ggtitle("Policy Bought based on Customer sub Type") +
    theme(plot.title = element_text(hjust = 0.5))

df$subtype = df$MOSTYPE
nrow(df[df$subtype == 33 & df$CARAVAN == 1,])
## [1] 80
nrow(df[df$subtype == 8 & df$CARAVAN == 1,])
## [1] 72
# category 33 and 8 purchased more policies.
# Based on our main category which compromises of various sub-categories, we can see that sub-categories number 33 and 8 are porne to buying insurance. These should be considered as the characteristics/attributes of the types of customers that exist in the main customer category. hence, we could say that, those who are middle class and those who are low class but have large families have a higher chance of getting the insurance 

Age

ggplot(df,aes(factor(df$MGEMLEEF))) + 
    geom_bar(aes(fill = factor(df$CARAVAN))) + 
    geom_text(stat='count', aes(label=..count..), vjust=0) + 
    labs(x="Age Group") +
    scale_fill_discrete(name = "CARAVAN") + 
    ggtitle("Policy bought on age group") +
    theme(plot.title = element_text(hjust = 0.5))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.

df$age = df$MGEMLEEF
nrow(df[df$age == 1 & df$CARAVAN == 1,])
## [1] 1
nrow(df[df$age == 2 & df$CARAVAN == 1,])
## [1] 156
nrow(df[df$age == 3 & df$CARAVAN == 1,])
## [1] 303
nrow(df[df$age == 4 & df$CARAVAN == 1,])
## [1] 105
nrow(df[df$age == 5 & df$CARAVAN == 1,])
## [1] 20
nrow(df[df$age == 6 & df$CARAVAN == 1,])
## [1] 1
# Here we have explored to see what is the age range of the customers that buy the insurance. Based on our analysis we see that customers who are between the ages 40-50 are prone to buying insurance compared with others. Hence, we could say that it is among the many characteristics of the main customer group [3,8]

No of houses

ggplot(df,aes(factor(df$MAANTHUI))) + 
    geom_bar(aes(fill = factor(df$CARAVAN))) + 
    geom_text(stat='count', aes(label=..count..), vjust=0) + 
    labs(x="Number of houses") +
    scale_fill_discrete(name = "CARAVAN") + 
    ggtitle("Number of houses customer has who bought insurance") +
    theme(plot.title = element_text(hjust = 0.5))

df$noofhouses = df$MAANTHUI
nrow(df[df$noofhouses == 1 & df$CARAVAN == 1,])
## [1] 526
nrow(df[df$noofhouses == 2 & df$CARAVAN == 1,])
## [1] 59
nrow(df[df$noofhouses == 3 & df$CARAVAN == 1,])
## [1] 1
# Now we wanted to see who is prone to getting an insurance with respect to number of houses and we have found that customers having at least 1 house are likely to get the insurance.

No of house hold

ggplot(df,aes(factor(df$MGEMOMV))) + 
    geom_bar(aes(fill = factor(df$CARAVAN))) + 
    geom_text(stat='count', aes(label=..count..), vjust=0) + 
    labs(x="Number of house hold") +
    scale_fill_discrete(name = "CARAVAN") + 
    ggtitle("Number of house hold") +
    theme(plot.title = element_text(hjust = 0.5))

df$noofhousehold = df$MGEMOMV
nrow(df[df$noofhousehold == 1 & df$CARAVAN == 1,])
## [1] 11
nrow(df[df$noofhousehold == 2 & df$CARAVAN == 1,])
## [1] 195
nrow(df[df$noofhousehold == 3 & df$CARAVAN == 1,])
## [1] 275
df$hasThreeHouseHold = ifelse(df$noofhousehold == 3, 1, 0)
# Now we wanted to see who is prone to getting an insurance with respect to number of houses and we have found that customers having at least 1 house are likely to get the insurance.

Charactertics we found so far

Customer having 3 house hold

Customer have one house

Age of customer is between 40 to 50

Customer are Driven Growers

Customer belongs to Lower class large families

Correlation Analysis

corrplot(cor(df[, c("subtype","maintype", "age", "noofhouses", "noofhousehold", "CARAVAN")]), method = "number")

# From the correlation matrix we have below, we have some interesting insights. The reason to run the correlation matrix was to figure out variables of interest which might cause either overfitting or underfitting. # Here we can see that the variables of interest are positively correlated. One exception to this is the correlation between age and number of households. We see that there is a negative correlation between it. Which actually makes sense because, the greater the age, the no of households will decrease. 

Which varaible to select when they are highly postive correlated?

cor(df$subtype, df$CARAVAN)
## [1] -0.06074174
cor(df$maintype, df$CARAVAN)
## [1] -0.05930648
# we will choose which have high corelation with response variable in this case both are weak correlated doesn't matter what we really choose
# we will choose maintype

Categorizing Data

Number of Houses

table(df$MAANTHUI)
## 
##    1    2    3    4    5    6    7    8   10 
## 8915  821   64    4    3    3    8    2    2
df$MAANTHUI = replace(df$MAANTHUI, df$MAANTHUI > 2, 2)
df$OneHouse = ifelse(df$MAANTHUI ==1, 1, 0)
df$moreThanTwoHouse = ifelse(df$MAANTHUI > 2, 1, 0)


# by looking frequencies of houses we categorized houses into dummies.

Grouping sub customer to meanful customer type.

df$averageFamily = ifelse(df$MOSTYPE %in% c(12,11,9,10,13), 1, 0)
df$loners = ifelse(df$MOSTYPE %in% c(17,15,18,16,19), 1, 0)
df$conservativeFamilies = ifelse(df$MOSTYPE %in% c(39,38), 1, 0)
df$crusingSeniors = ifelse(df$MOSTYPE %in% c(26,25,28,27), 1, 0)
df$drivenGrowers = ifelse(df$MOSTYPE %in% c(6,7,8), 1, 0)
df$grownups = ifelse(df$MOSTYPE %in% c(33,34,35,36,37), 1, 0)
df$framers = ifelse(df$MOSTYPE %in% c(40,41), 1, 0)
df$livingWell = ifelse(df$MOSTYPE %in% c(20,21,22,23,24), 1, 0)
df$retired = ifelse(df$MOSTYPE %in% c(29,30,31,32), 1, 0)
df$successful = ifelse(df$MOSTYPE %in% c(1,2,3,4,5), 1, 0)

dat <- data.frame(
  Categorized_Customers = factor(c("averageFamily", "loners", "conservativeFamilies", "crusingSeniors", "drivenGrowers", "grownups", "framers", "livingWell", "retired", "successful"), levels=c("averageFamily", "loners", "conservativeFamilies", "crusingSeniors", "drivenGrowers", "grownups", "framers", "livingWell", "retired", "successful")),
  Count = c( sum(df$averageFamily), sum(df$loners), sum(df$conservativeFamilies), sum(df$crusingSeniors), sum(df$drivenGrowers), sum(df$grownups), sum(df$framers), sum(df$livingWell), sum(df$retired), sum(df$successful) )
)

ggplot(data=dat, aes(x=Categorized_Customers, y=Count, fill=Categorized_Customers)) +
  geom_bar(colour="red", stat="identity")

Income Conversion

# Converting 30k income into value
df$MINKM30_c = ifelse(df$MINKM30 == 1, 0.05 * 30000, df$MINKM30)
df$MINKM30_c = ifelse(df$MINKM30_c == 2, 0.17 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 3, 0.3 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 4, 0.43 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 5, 0.56 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 6, 0.69 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 7, 0.82 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 8, 0.94 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 9, 1 * 30000, df$MINKM30_c)


# Converting 45k income into value
df$MINK3045_c = ifelse(df$MINK3045 == 1, 0.05 * 45000, df$MINK3045)
df$MINK3045_c = ifelse(df$MINK3045_c == 2, 0.17 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 3, 0.3 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 4, 0.43 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 5, 0.56 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 6, 0.69 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 7, 0.82 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 8, 0.94 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 9, 1 * 45000, df$MINK3045_c)

# Converting 70k income into value
df$MINK4575_c = ifelse(df$MINK4575 == 1, 0.05 * 75000, df$MINK4575)
df$MINK4575_c = ifelse(df$MINK4575_c == 2, 0.17 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 3, 0.3 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 4, 0.43 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 5, 0.56 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 6, 0.69 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 7, 0.82 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 8, 0.94 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 9, 1 * 75000, df$MINK4575_c)

# Converting 122k income into value
df$MINK7512_c = ifelse(df$MINK7512 == 1, 0.05 * 122000, df$MINK7512)
df$MINK7512_c = ifelse(df$MINK7512_c == 2, 0.17 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 3, 0.3 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 4, 0.43 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 5, 0.56 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 6, 0.69 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 7, 0.82 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 8, 0.94 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 9, 1 * 122000, df$MINK7512_c)

# Converting 123k income into value
df$MINK123M_c = ifelse(df$MINK123M == 1, 0.05 * 123000, df$MINK123M)
df$MINK123M_c = ifelse(df$MINK123M_c == 2, 0.17 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 3, 0.3 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 4, 0.43 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 5, 0.56 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 6, 0.69 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 7, 0.82 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 8, 0.94 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 9, 1 * 123000, df$MINK123M_c)

# Average income
df$MINKGEM_c = (df$MINK123M_c + df$MINK7512_c + df$MINK4575_c + df$MINK3045_c + df$MINKM30_c)/5
hist(df$MINKGEM_c, main ="Income in numerical", xlab ="Income")

Converting age into numerical.

df$MGEMLEEF_c = ifelse(df$MGEMLEEF == 1, 25, df$MGEMLEEF)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 2, 35, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 3, 45, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 4, 55, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 5, 65, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 4, 75, df$MGEMLEEF_c)
# MOSTYPE : customer subtype
# MFWEKIND: Household with children
# MOPLLAAG: Lower level education
# MHHUUR :  Rented house
# MHKOOP :  Home owners
# MINKM30:  Income < 30.000 low income
# MINK7512: Income 75-122.000 high income
# MKOOPKLA: Purchasing power class
# PPERSAUT: Contribution car policies
# CARAVAN:  Number of mobile home policies 0 - 1

Out liers

plot_ly(x = ~df$MINKM30_c, y = ~df$MOSTYPE, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
# After having converted the categorical variable for income to numerical, here we take income less than 3000 against customer sub-categories and draw a box plot. From the below box plot, we can see that as soon as income crosses 10K, we begint to see a few outliers for certain sub-categories. Another important thing to note is that when income jumps above 20K, the distance of outliers starts to increase. We could say that we are seeing extreme outliers in income levels ranging from 25K to 30K. 

# Now the question is if we should keep outliers in our analysis, whether mild or extreme or delete both of them and then proceed?
plot_ly(y = ~df$MOSTYPE, x = ~df$MINK3045_c, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
plot_ly(x = df$MINK4575_c, y = df$MOSTYPE, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed

Class Im balance problem

prop.table(table(df$CARAVAN))
## 
##          0          1 
## 0.94033802 0.05966198
# 94% people didn't buy insurance and only 5% bought. under sampling problem

barplot(prop.table(table(df$CARAVAN)), col = rainbow(2), ylim = c(0,0.7), main = "Class Distribution")

Partition data

# training data
df_train = (df[df$ORIGIN == "train",])
df_train = (df_train[,-1])
table(df_train$CARAVAN)
## 
##    0    1 
## 5474  348
#testing data
df_test = (df[df$ORIGIN == "test",])
df_test = (df_test[,-1])
nrow(df_test)
## [1] 4000
table(df_test$CARAVAN)
## 
##    0    1 
## 3762  238

Solving class imbalance problem

over_train = ovun.sample(CARAVAN ~ ., data =df_train, method = "over", N =10948)$data
table(over_train$CARAVAN)
## 
##    0    1 
## 5474 5474
over_test = ovun.sample(CARAVAN ~ ., data =df_test, method = "over", N =nrow(df_test))$data
table(over_test$CARAVAN)
## 
##    0    1 
## 3762  238
# we are not fixing sampling problem in test data, i did this becuase i was having error with doing prediction

Convert train data set into factor

for(i in 1:ncol(over_train)){
over_train[,i] <- as.factor(over_train[,i])
}

Convert test data set into factor

for(i in 1:ncol(over_test)){
over_test[,i] <- as.factor(over_test[,i])
}
over_test$MINKGEM_c = as.numeric(over_test$MINKGEM_c)
over_train$MINKGEM_c = as.numeric(over_train$MINKGEM_c)

over_test$MGEMLEEF_c = as.numeric(over_test$MGEMLEEF_c)
over_train$MGEMLEEF_c = as.numeric(over_train$MGEMLEEF_c)

Draw Confusion matrix function

drewSummary = function(model) {
  summary(model)
}
drewMatrix = function(model, test_data) {
   predicted = predict(model, test_data, type = "response")
   predictedClass = ifelse(predicted>=0.5, 1, 0)
   confusionMatrix(as.factor(predictedClass), as.factor(test_data$CARAVAN), positive = "1")
}
drewAnova = function(model1, model2){
    anova(model1, model2, test = 'Chisq')
}
drewROC = function(model){
   predicted = predict(model, over_test, type = "response")
   predictedClass = ifelse(predicted>=0.5, 1, 0)
   r = roc(over_test$CARAVAN, predictedClass)
   plot.roc(r)
}
getRMSE = function(predictedClass){
  accuracy(predictedClass, as.numeric(over_test$CARAVAN))[2]
}

Correlation of Model 1 predictors

new_data = over_train
new_data$MOSHOOFD = as.numeric(new_data$MOSHOOFD)
new_data$MGEMOMV = as.numeric(new_data$MGEMOMV)
new_data$MINKGEM = as.numeric(new_data$MINKGEM)
new_data$MGEMLEEF = as.numeric(new_data$MGEMLEEF)
new_data$CARAVAN = as.numeric(new_data$CARAVAN)
new_data$OneHouse = as.numeric(new_data$OneHouse)

corrplot(cor(subset(new_data , select = c("MOSHOOFD", "MGEMOMV", "OneHouse", "MINKGEM", "MGEMLEEF", "CARAVAN"))), method = "number", type = "upper")

Model 1

set.seed(123)

logit.reg = glm(CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c+MGEMLEEF_c, data = over_train, family = binomial (link = "logit"))

logit.reg$xlevels[["MGEMOMV"]] <- union(logit.reg$xlevels[["MGEMOMV"]], levels(over_test$MGEMOMV))

drewSummary(logit.reg)
## 
## Call:
## glm(formula = CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c + 
##     MGEMLEEF_c, family = binomial(link = "logit"), data = over_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7121  -1.1587   0.3691   1.1464   1.9105  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.309e-01  2.067e-01  -3.052  0.00227 ** 
## MOSHOOFD2    4.785e-01  8.530e-02   5.609 2.04e-08 ***
## MOSHOOFD3   -1.825e-01  8.124e-02  -2.247  0.02465 *  
## MOSHOOFD4   -1.460e+01  1.212e+02  -0.120  0.90409    
## MOSHOOFD5   -1.046e+00  1.077e-01  -9.709  < 2e-16 ***
## MOSHOOFD6   -1.366e+00  1.620e-01  -8.432  < 2e-16 ***
## MOSHOOFD7   -7.616e-01  9.582e-02  -7.949 1.89e-15 ***
## MOSHOOFD8   -3.133e-01  7.217e-02  -4.341 1.42e-05 ***
## MOSHOOFD9   -2.058e-01  8.669e-02  -2.374  0.01760 *  
## MOSHOOFD10  -1.325e+00  1.333e-01  -9.935  < 2e-16 ***
## MGEMOMV2     2.640e-01  1.292e-01   2.044  0.04096 *  
## MGEMOMV3     2.890e-01  1.322e-01   2.186  0.02878 *  
## MGEMOMV4     3.228e-01  1.433e-01   2.252  0.02431 *  
## MGEMOMV5     2.260e-01  2.216e-01   1.020  0.30795    
## OneHouse1    6.377e-02  7.015e-02   0.909  0.36328    
## MINKGEM_c    1.576e-03  2.624e-04   6.004 1.92e-09 ***
## MGEMLEEF_c   9.684e-02  3.096e-02   3.128  0.00176 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15177  on 10947  degrees of freedom
## Residual deviance: 14376  on 10931  degrees of freedom
## AIC: 14410
## 
## Number of Fisher Scoring iterations: 13
drewMatrix(logit.reg, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  221    3
##          1 3541  235
##                                           
##                Accuracy : 0.114           
##                  95% CI : (0.1043, 0.1243)
##     No Information Rate : 0.9405          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0058          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.98739         
##             Specificity : 0.05875         
##          Pos Pred Value : 0.06224         
##          Neg Pred Value : 0.98661         
##              Prevalence : 0.05950         
##          Detection Rate : 0.05875         
##    Detection Prevalence : 0.94400         
##       Balanced Accuracy : 0.52307         
##                                           
##        'Positive' Class : 1               
## 
drewROC(logit.reg)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# getRMSE(logit.reg)
# drewAnova(logit.reg)

# we did regression with/out house we see a minimal affect i.e with house model is predicting 60 true positives without it's predicting 65 and it's not significant, so we decided not to include this variable.
# difference in deviance =  Null deviance (15177) - 14398 = 872
train_2 = over_train
train_2 = subset(over_train, select = -c(maintype,subtype,age,noofhouses,noofhousehold,hasThreeHouseHold,OneHouse,moreThanTwoHouse,averageFamily,loners,conservativeFamilies,crusingSeniors,drivenGrowers,grownups,framers,livingWell,retired,successful,MINKM30_c,MINK3045_c,MINK4575_c,MINK7512_c,MINK123M_c,MINKGEM_c,MGEMLEEF_c,MOSHOOFD,MGEMOMV,MAANTHUI,MINKGEM,MGEMLEEF) )
length(train_2)
## [1] 81

Droping 5 variables that we have used in first regression. and dummies we have created earlier.

new_df  = train_2
for(i in 1:ncol(new_df)){
new_df[,i] <- as.integer(new_df[,i])
}

Model 2 | Building model using correlation Analysis

Pre processing

zv = apply(new_df, 2, function(x) length(unique(x)) == 1)
dfr = new_df[, !zv]
n=length(colnames(dfr))
correlationMatrix = cor(dfr[,1:n],use="complete.obs")
summary(correlationMatrix[upper.tri(correlationMatrix)])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.999761 -0.024534 -0.001756  0.010891  0.037462  0.983494
# After removing our suspected predictors we still have strong positive correlation with 0.98% and strong negative corelation with -0.99%, we need to find which of them are highly correlated
high = findCorrelation(correlationMatrix, cutoff = 0.75, names = TRUE)
high
##  [1] "MHHUUR"   "MZPART"   "MSKA"     "MRELOV"   "PBRAND"   "PWAPART" 
##  [7] "PTRACTOR" "APERSAUT" "AWABEDR"  "MGODGE"   "AAANHANG" "PWALAND" 
## [13] "PZEILPL"  "PBESAUT"  "APLEZIER" "PBYSTAND" "PGEZONG"  "AWERKT"  
## [19] "AWAOREG"  "PLEVEN"   "PFIETS"   "PINBOED"  "PBROM"    "PPERSONG"
## [25] "PMOTSCO"  "AVRAAUT"
# there are 26 variables which are correlated with each other, before dropping them we need to see how they are correlated with response variable.
length(high)
## [1] 26
target_cor_df = data.frame(CARAVAN = cor(df_train[,sort(high)], df_train[, "CARAVAN"]))



cor_df = target_cor_df[order(target_cor_df$CARAVAN,decreasing = T),,drop=F]

excludedVariables = row.names(cor_df[cor_df$CARAVAN < 0.1, ,drop=F])
excludedVariables
##  [1] "PWAPART"  "PBRAND"   "MSKA"     "PBYSTAND" "MZPART"   "PGEZONG" 
##  [7] "PFIETS"   "AWAOREG"  "PLEVEN"   "PZEILPL"  "AAANHANG" "PMOTSCO" 
## [13] "PINBOED"  "AWABEDR"  "PBESAUT"  "AVRAAUT"  "PPERSONG" "PTRACTOR"
## [19] "AWERKT"   "PWALAND"  "MGODGE"   "PBROM"    "MRELOV"   "MHHUUR"
paste0("excluding total variables from main data set ", length(excludedVariables))
## [1] "excluding total variables from main data set 24"
# There are 24 variables which are less correlated with response variable and having correlation coefficient less than 0.1 so we will exclude them.
train_2 = data.frame(train_2[, !colnames(train_2) %in% excludedVariables])
names(train_2)
##  [1] "MOSTYPE"  "MGODRK"   "MGODPR"   "MGODOV"   "MRELGE"   "MRELSA"  
##  [7] "MFALLEEN" "MFGEKIND" "MFWEKIND" "MOPLHOOG" "MOPLMIDD" "MOPLLAAG"
## [13] "MBERHOOG" "MBERZELF" "MBERBOER" "MBERMIDD" "MBERARBG" "MBERARBO"
## [19] "MSKB1"    "MSKB2"    "MSKC"     "MSKD"     "MHKOOP"   "MAUT1"   
## [25] "MAUT2"    "MAUT0"    "MZFONDS"  "MINKM30"  "MINK3045" "MINK4575"
## [31] "MINK7512" "MINK123M" "MKOOPKLA" "PWABEDR"  "PPERSAUT" "PVRAAUT" 
## [37] "PAANHANG" "PWERKT"   "PWAOREG"  "PPLEZIER" "AWAPART"  "AWALAND" 
## [43] "APERSAUT" "ABESAUT"  "AMOTSCO"  "ATRACTOR" "ABROM"    "ALEVEN"  
## [49] "APERSONG" "AGEZONG"  "ABRAND"   "AZEILPL"  "APLEZIER" "AFIETS"  
## [55] "AINBOED"  "ABYSTAND" "CARAVAN"
length(train_2)
## [1] 57
# corrplot(cor(train_3), method = "number")
# 24 + 5 (predictors in model) = 29
# around 29 variables have been excluded from set so far next step would be to find good predictors which are not highly correlated each other and are significant.
# we have reduce dimension from 86 to 57
# corelation between no of car policy and carvan
cor(df_train$APERSAUT, df_train$CARAVAN)
## [1] 0.1442105
# # corelation between contribution car policies and carvan
cor(df_train$PPERSAUT, df_train$CARAVAN)
## [1] 0.1509097
# # corelation between  Purchasing power class and carvan
cor(df_train$MKOOPKLA, df_train$CARAVAN)
## [1] 0.09593826
new_train_2 = train_2
for(i in 1:ncol(new_train_2)){
new_train_2[,i] <- as.numeric(new_train_2[,i])
}
cor_response = data.frame("ind_var" = colnames(new_train_2), "dep_var" = "CARAVAN", "cor_coeff" = 0, "p_values" = 0)

for (i in colnames(new_train_2)){
    cor_test <- cor.test(new_train_2[,i], new_train_2[,"CARAVAN"])
    cor_response[cor_response$ind_var == i, "correlation_coefficient"] = cor_test$estimate
    cor_response[cor_response$ind_var == i, "p_values"] = cor_test$p.value
}
cor_response[order(cor_response$cor_coeff, decreasing = T),]
##     ind_var dep_var cor_coeff      p_values correlation_coefficient
## 1   MOSTYPE CARAVAN         0  4.415694e-46            -0.135586889
## 2    MGODRK CARAVAN         0  2.583334e-01             0.010803843
## 3    MGODPR CARAVAN         0  3.231555e-12             0.066521701
## 4    MGODOV CARAVAN         0  3.624139e-01             0.008705328
## 5    MRELGE CARAVAN         0  2.020100e-65             0.162193454
## 6    MRELSA CARAVAN         0  2.489261e-14            -0.072761240
## 7  MFALLEEN CARAVAN         0  4.095222e-41            -0.127830892
## 8  MFGEKIND CARAVAN         0  2.308525e-01             0.011452092
## 9  MFWEKIND CARAVAN         0  1.871244e-16             0.078544087
## 10 MOPLHOOG CARAVAN         0  2.694548e-65             0.162035905
## 11 MOPLMIDD CARAVAN         0  2.549701e-22             0.092679218
## 12 MOPLLAAG CARAVAN         0  5.086443e-87            -0.187321638
## 13 MBERHOOG CARAVAN         0  4.801007e-40             0.126097758
## 14 MBERZELF CARAVAN         0  3.436983e-06             0.044355711
## 15 MBERBOER CARAVAN         0  2.812603e-39            -0.124837889
## 16 MBERMIDD CARAVAN         0  6.530849e-25             0.098282054
## 17 MBERARBG CARAVAN         0  1.966259e-20            -0.088376598
## 18 MBERARBO CARAVAN         0  1.086222e-37            -0.122192046
## 19    MSKB1 CARAVAN         0  8.813456e-12             0.065163033
## 20    MSKB2 CARAVAN         0  4.431230e-01             0.007330476
## 21     MSKC CARAVAN         0  6.375227e-19            -0.084775686
## 22     MSKD CARAVAN         0  1.023999e-49            -0.140985103
## 23   MHKOOP CARAVAN         0  8.026033e-79             0.178225584
## 24    MAUT1 CARAVAN         0  6.500578e-54             0.146964982
## 25    MAUT2 CARAVAN         0  3.165699e-02             0.020535970
## 26    MAUT0 CARAVAN         0  2.805594e-69            -0.166974459
## 27  MZFONDS CARAVAN         0  1.652767e-35            -0.118455005
## 28  MINKM30 CARAVAN         0  6.011566e-78            -0.177226008
## 29 MINK3045 CARAVAN         0  6.399294e-01            -0.004471342
## 30 MINK4575 CARAVAN         0  1.570240e-35             0.118493703
## 31 MINK7512 CARAVAN         0  3.180369e-38             0.123087973
## 32 MINK123M CARAVAN         0  5.496601e-01            -0.005718413
## 33 MKOOPKLA CARAVAN         0 9.928322e-101             0.201559037
## 34  PWABEDR CARAVAN         0  8.057106e-01             0.002350997
## 35 PPERSAUT CARAVAN         0 7.707523e-290             0.337512725
## 36  PVRAAUT CARAVAN         0  3.485079e-03            -0.027917884
## 37 PAANHANG CARAVAN         0  1.397315e-02             0.023490693
## 38   PWERKT CARAVAN         0  1.875280e-05            -0.040885355
## 39  PWAOREG CARAVAN         0  1.281517e-08             0.054332527
## 40 PPLEZIER CARAVAN         0  7.019402e-28             0.104330662
## 41  AWAPART CARAVAN         0  1.924030e-92             0.193087412
## 42  AWALAND CARAVAN         0  1.186585e-06            -0.046408075
## 43 APERSAUT CARAVAN         0 4.738411e-225             0.299082035
## 44  ABESAUT CARAVAN         0  1.034142e-01            -0.015565015
## 45  AMOTSCO CARAVAN         0  7.869497e-02             0.016805316
## 46 ATRACTOR CARAVAN         0  1.650151e-05            -0.041156296
## 47    ABROM CARAVAN         0  7.781814e-37            -0.120741653
## 48   ALEVEN CARAVAN         0  2.431827e-12             0.066901849
## 49 APERSONG CARAVAN         0  1.563785e-01            -0.013546952
## 50  AGEZONG CARAVAN         0  5.895091e-08             0.051790072
## 51   ABRAND CARAVAN         0  7.039583e-60             0.155053827
## 52  AZEILPL CARAVAN         0  2.039983e-04             0.035488595
## 53 APLEZIER CARAVAN         0  3.017601e-35             0.117999215
## 54   AFIETS CARAVAN         0  3.534313e-11             0.063235684
## 55  AINBOED CARAVAN         0  2.257429e-04             0.035243207
## 56 ABYSTAND CARAVAN         0  5.291765e-23             0.094187928
## 57  CARAVAN CARAVAN         0  0.000000e+00             1.000000000
corrplot(cor(subset(new_train_2 , select = c(-CARAVAN))), method = "square", type = "upper")

# corelation between Contribution car policies and number of car policies 
cor(df_train$PPERSAUT, df_train$APERSAUT)
## [1] 0.9161545
# There is a high correlation between car policies and number of car policies we will exclude a variable which has less correlation with response variable.
# it's not always necessary to see how much it relates which response variable but it's good as it's tells us how much response variable changes for given predictor.
cor(df_train[ , c("PPERSAUT", "APERSAUT")], df_train[ , "CARAVAN"])
##            CARAVAN
## PPERSAUT 0.1509097
## APERSAUT 0.1442105
# Contribution car policies is more correlated with response variable so we exclude Number of car policies
train_2 = data.frame(train_2[ , !colnames(train_2) %in% c("APERSAUT")])
paste0("after removing APERSAUT dimension is", length(train_2))
## [1] "after removing APERSAUT dimension is56"
# final variable selected
names(train_2)
##  [1] "MOSTYPE"  "MGODRK"   "MGODPR"   "MGODOV"   "MRELGE"   "MRELSA"  
##  [7] "MFALLEEN" "MFGEKIND" "MFWEKIND" "MOPLHOOG" "MOPLMIDD" "MOPLLAAG"
## [13] "MBERHOOG" "MBERZELF" "MBERBOER" "MBERMIDD" "MBERARBG" "MBERARBO"
## [19] "MSKB1"    "MSKB2"    "MSKC"     "MSKD"     "MHKOOP"   "MAUT1"   
## [25] "MAUT2"    "MAUT0"    "MZFONDS"  "MINKM30"  "MINK3045" "MINK4575"
## [31] "MINK7512" "MINK123M" "MKOOPKLA" "PWABEDR"  "PPERSAUT" "PVRAAUT" 
## [37] "PAANHANG" "PWERKT"   "PWAOREG"  "PPLEZIER" "AWAPART"  "AWALAND" 
## [43] "ABESAUT"  "AMOTSCO"  "ATRACTOR" "ABROM"    "ALEVEN"   "APERSONG"
## [49] "AGEZONG"  "ABRAND"   "AZEILPL"  "APLEZIER" "AFIETS"   "AINBOED" 
## [55] "ABYSTAND" "CARAVAN"
length(train_2)
## [1] 56
step.wise1 = glm(CARAVAN ~ ., data = train_2, family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#summary(step.wise1)
# difference between null deviance. = 15177 - 12386 = 2791
# step(step.wise1, direction = "backwards")
# after running backward dimension has reduce to from 56 to 41 variables
# null deviance = 15177 - 12394 = 2783
for(i in 1:ncol(train_2)){
train_2[,i] <- as.factor(train_2[,i])
}

Model 2

# model.2 = glm(formula = CARAVAN ~ MAANTHUI + MGEMOMV + MGEMLEEF + MOSHOOFD + 
#     MGODRK + MGODPR + MGODOV + MGODGE + MRELGE + MRELSA + MFWEKIND + MOPLMIDD, family = binomial(link = "logit"), data = train_2)


model.2 = glm(formula = CARAVAN ~ MOSTYPE + MGODRK + MGODPR + MGODOV + 
    MRELGE + MRELSA + MOPLMIDD + MOPLLAAG + MBERHOOG + MBERZELF + 
    MBERBOER + MBERMIDD + MBERARBG + MBERARBO + MSKC + MSKD + 
    MHKOOP + MAUT1 + MAUT2 + MAUT0 + MINK3045 + MINK7512 + MINK123M + 
    MKOOPKLA + PPERSAUT + PMOTSCO + PVRAAUT + PAANHANG + PWERKT + 
    PWAOREG + PPLEZIER + AWAPART + AWALAND + ABROM + ALEVEN + 
    APERSONG + AGEZONG + ABRAND + APLEZIER + AFIETS + ABYSTAND, 
    family = binomial(link = "logit"), data = over_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model.2$xlevels[["MSKD"]] <- union(model.2$xlevels[["MSKD"]], levels(over_test$MSKD))
model.2$xlevels[["MAUT2"]] <- union(model.2$xlevels[["MAUT2"]], levels(over_test$MAUT2))
model.2$xlevels[["MINK123M"]] <- union(model.2$xlevels[["MINK123M"]], levels(over_test$MINK123M))
model.2$xlevels[["PPERSAUT"]] <- union(model.2$xlevels[["PPERSAUT"]], levels(over_test$PPERSAUT))
model.2$xlevels[["PVRAAUT"]] <- union(model.2$xlevels[["PVRAAUT"]], levels(over_test$PVRAAUT))
model.2$xlevels[["PWERKT"]] <- union(model.2$xlevels[["PWERKT"]], levels(over_test$PWERKT))
model.2$xlevels[["ABROM"]] <- union(model.2$xlevels[["ABROM"]], levels(over_test$ABROM))
model.2$xlevels[["ALEVEN"]] <- union(model.2$xlevels[["ALEVEN"]], levels(over_test$ALEVEN))
model.2$xlevels[["ABRAND"]] <- union(model.2$xlevels[["ABRAND"]], levels(over_test$ABRAND))
model.2$xlevels[["AFIETS"]] <- union(model.2$xlevels[["AFIETS"]], levels(over_test$AFIETS))

drewSummary(model.2)
## 
## Call:
## glm(formula = CARAVAN ~ MOSTYPE + MGODRK + MGODPR + MGODOV + 
##     MRELGE + MRELSA + MOPLMIDD + MOPLLAAG + MBERHOOG + MBERZELF + 
##     MBERBOER + MBERMIDD + MBERARBG + MBERARBO + MSKC + MSKD + 
##     MHKOOP + MAUT1 + MAUT2 + MAUT0 + MINK3045 + MINK7512 + MINK123M + 
##     MKOOPKLA + PPERSAUT + PMOTSCO + PVRAAUT + PAANHANG + PWERKT + 
##     PWAOREG + PPLEZIER + AWAPART + AWALAND + ABROM + ALEVEN + 
##     APERSONG + AGEZONG + ABRAND + APLEZIER + AFIETS + ABYSTAND, 
##     family = binomial(link = "logit"), data = over_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3431  -0.6506   0.0000   0.7786   2.1267  
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -7.41685 4579.63857  -0.002 0.998708    
## MOSTYPE2      -0.10324    0.71288  -0.145 0.884854    
## MOSTYPE3      -0.16883    0.66344  -0.254 0.799126    
## MOSTYPE4      -1.08821    0.71197  -1.528 0.126402    
## MOSTYPE5       2.72226    1.60590   1.695 0.090046 .  
## MOSTYPE6      -0.47535    0.26644  -1.784 0.074417 .  
## MOSTYPE7      -1.14561    0.69688  -1.644 0.100193    
## MOSTYPE8       0.55079    0.47531   1.159 0.246531    
## MOSTYPE9       2.21767    1.54445   1.436 0.151033    
## MOSTYPE10     -0.28980    0.26709  -1.085 0.277918    
## MOSTYPE11     -0.26363    0.65487  -0.403 0.687265    
## MOSTYPE12      1.93786    0.48671   3.982 6.85e-05 ***
## MOSTYPE13     -1.21544    0.66186  -1.836 0.066299 .  
## MOSTYPE15    -27.10438 3095.65811  -0.009 0.993014    
## MOSTYPE16    -28.27421 1749.33018  -0.016 0.987104    
## MOSTYPE17    -17.98471 1879.42406  -0.010 0.992365    
## MOSTYPE18    -30.12218 1771.39099  -0.017 0.986433    
## MOSTYPE19    -16.14076 3176.46592  -0.005 0.995946    
## MOSTYPE20    -13.14323 1366.24381  -0.010 0.992324    
## MOSTYPE21    -28.20314 1992.20104  -0.014 0.988705    
## MOSTYPE22    -13.46425 1366.24373  -0.010 0.992137    
## MOSTYPE23      2.14768    1.59505   1.346 0.178153    
## MOSTYPE24    -13.31532 1366.24376  -0.010 0.992224    
## MOSTYPE25    -13.48114 1366.24395  -0.010 0.992127    
## MOSTYPE26    -11.79190 1366.24401  -0.009 0.993114    
## MOSTYPE27    -14.27165 1366.24401  -0.010 0.991666    
## MOSTYPE28    -30.71025 1698.47508  -0.018 0.985574    
## MOSTYPE29      2.57630    1.61237   1.598 0.110079    
## MOSTYPE30    -12.50424 1366.24377  -0.009 0.992698    
## MOSTYPE31    -12.62243 1366.24395  -0.009 0.992629    
## MOSTYPE32    -12.14105 1366.24395  -0.009 0.992910    
## MOSTYPE33      3.71042    1.58307   2.344 0.019087 *  
## MOSTYPE34     -0.41054    0.67496  -0.608 0.543026    
## MOSTYPE35      1.64611    1.51092   1.089 0.275945    
## MOSTYPE36      4.32379    1.58836   2.722 0.006485 ** 
## MOSTYPE37      2.98559    1.54804   1.929 0.053777 .  
## MOSTYPE38      3.10214    1.54153   2.012 0.044180 *  
## MOSTYPE39      2.29400    1.50275   1.527 0.126878    
## MOSTYPE40    -16.24945  638.31965  -0.025 0.979691    
## MOSTYPE41      2.06908    1.55430   1.331 0.183124    
## MGODRK1        0.40285    0.09280   4.341 1.42e-05 ***
## MGODRK2        0.33758    0.10358   3.259 0.001118 ** 
## MGODRK3       -0.76994    0.23278  -3.308 0.000941 ***
## MGODRK4       -1.16676    0.47268  -2.468 0.013572 *  
## MGODRK5        1.28077    0.52556   2.437 0.014812 *  
## MGODRK6        0.40570    0.67000   0.606 0.544830    
## MGODRK7      -15.75900 2433.76873  -0.006 0.994834    
## MGODRK8      -15.21832 3403.75972  -0.004 0.996433    
## MGODRK9      -14.94526 2408.35442  -0.006 0.995049    
## MGODPR1        0.78586    0.55549   1.415 0.157156    
## MGODPR2        2.55851    0.50764   5.040 4.65e-07 ***
## MGODPR3        2.45086    0.50261   4.876 1.08e-06 ***
## MGODPR4        2.67961    0.50108   5.348 8.91e-08 ***
## MGODPR5        2.51578    0.49812   5.051 4.40e-07 ***
## MGODPR6        2.33213    0.49841   4.679 2.88e-06 ***
## MGODPR7        3.22119    0.50732   6.349 2.16e-10 ***
## MGODPR8        2.81991    0.58187   4.846 1.26e-06 ***
## MGODPR9        2.53108    0.53082   4.768 1.86e-06 ***
## MGODOV1       -0.52840    0.09715  -5.439 5.35e-08 ***
## MGODOV2       -0.04173    0.09437  -0.442 0.658345    
## MGODOV3        0.46006    0.16416   2.802 0.005072 ** 
## MGODOV4       -0.89691    0.25812  -3.475 0.000511 ***
## MGODOV5        1.59640    0.51383   3.107 0.001891 ** 
## MRELGE1      -18.17325  416.44469  -0.044 0.965192    
## MRELGE2       -2.18561    0.58357  -3.745 0.000180 ***
## MRELGE3       -0.68288    0.52201  -1.308 0.190813    
## MRELGE4       -1.14784    0.50224  -2.285 0.022288 *  
## MRELGE5       -0.77731    0.48886  -1.590 0.111824    
## MRELGE6       -1.01538    0.49558  -2.049 0.040473 *  
## MRELGE7       -0.41238    0.49275  -0.837 0.402650    
## MRELGE8       -0.34251    0.51075  -0.671 0.502468    
## MRELGE9       -0.38500    0.49606  -0.776 0.437683    
## MRELSA1        0.02736    0.09511   0.288 0.773600    
## MRELSA2        0.04240    0.11325   0.374 0.708126    
## MRELSA3        1.20022    0.23333   5.144 2.69e-07 ***
## MRELSA4       -1.82819    0.51742  -3.533 0.000410 ***
## MRELSA5      -17.11939 1300.75148  -0.013 0.989499    
## MRELSA6      -18.31177 1560.94502  -0.012 0.990640    
## MRELSA7       -0.79912 6535.91926   0.000 0.999902    
## MOPLMIDD1     -0.32384    0.31794  -1.019 0.308403    
## MOPLMIDD2     -0.82103    0.27005  -3.040 0.002363 ** 
## MOPLMIDD3     -1.20389    0.26250  -4.586 4.51e-06 ***
## MOPLMIDD4     -1.61346    0.26607  -6.064 1.33e-09 ***
## MOPLMIDD5     -1.62228    0.27624  -5.873 4.29e-09 ***
## MOPLMIDD6     -1.67106    0.30278  -5.519 3.41e-08 ***
## MOPLMIDD7     -1.68838    0.34193  -4.938 7.90e-07 ***
## MOPLMIDD8     -2.67864    0.49748  -5.384 7.27e-08 ***
## MOPLMIDD9     -2.87221    0.44908  -6.396 1.60e-10 ***
## MOPLLAAG1     -0.58933    0.28750  -2.050 0.040379 *  
## MOPLLAAG2     -0.50970    0.25183  -2.024 0.042972 *  
## MOPLLAAG3     -0.83420    0.25112  -3.322 0.000894 ***
## MOPLLAAG4     -1.50497    0.25958  -5.798 6.72e-09 ***
## MOPLLAAG5     -1.86201    0.27526  -6.764 1.34e-11 ***
## MOPLLAAG6     -2.13325    0.29978  -7.116 1.11e-12 ***
## MOPLLAAG7     -2.37819    0.33730  -7.051 1.78e-12 ***
## MOPLLAAG8     -2.61895    0.41504  -6.310 2.79e-10 ***
## MOPLLAAG9     -4.63719    0.45506 -10.190  < 2e-16 ***
## MBERHOOG1     -0.21111    0.11691  -1.806 0.070965 .  
## MBERHOOG2      0.42619    0.13162   3.238 0.001204 ** 
## MBERHOOG3      0.06442    0.18401   0.350 0.726263    
## MBERHOOG4      0.77379    0.23736   3.260 0.001114 ** 
## MBERHOOG5      0.25770    0.31671   0.814 0.415830    
## MBERHOOG6      1.82551    0.38131   4.787 1.69e-06 ***
## MBERHOOG7      2.48575    0.47462   5.237 1.63e-07 ***
## MBERHOOG8      4.38061    0.69155   6.334 2.38e-10 ***
## MBERHOOG9      1.04553    0.66383   1.575 0.115257    
## MBERZELF1      0.24292    0.10194   2.383 0.017175 *  
## MBERZELF2      0.43503    0.15061   2.889 0.003870 ** 
## MBERZELF3      0.61528    0.43048   1.429 0.152917    
## MBERZELF4     -0.04689    0.69496  -0.067 0.946209    
## MBERZELF5     -0.27715    0.50437  -0.549 0.582670    
## MBERBOER1      0.22806    0.10311   2.212 0.026989 *  
## MBERBOER2      0.26898    0.16473   1.633 0.102513    
## MBERBOER3      1.06456    0.29376   3.624 0.000290 ***
## MBERBOER4     -0.34118    0.45635  -0.748 0.454681    
## MBERBOER5      3.45087    0.57858   5.964 2.46e-09 ***
## MBERBOER6    -12.21960 1211.19733  -0.010 0.991950    
## MBERBOER7      4.28602 3679.77206   0.001 0.999071    
## MBERBOER8      9.57676 2624.35449   0.004 0.997088    
## MBERBOER9      6.53143 3285.83279   0.002 0.998414    
## MBERMIDD1     -0.72007    0.20360  -3.537 0.000405 ***
## MBERMIDD2      0.07983    0.17384   0.459 0.646065    
## MBERMIDD3      0.06615    0.20720   0.319 0.749511    
## MBERMIDD4      0.87849    0.25001   3.514 0.000442 ***
## MBERMIDD5      0.82237    0.31053   2.648 0.008091 ** 
## MBERMIDD6      1.53755    0.36916   4.165 3.11e-05 ***
## MBERMIDD7      1.55577    0.45750   3.401 0.000672 ***
## MBERMIDD8    -19.80784 1294.50449  -0.015 0.987792    
## MBERMIDD9      2.67522    0.55795   4.795 1.63e-06 ***
## MBERARBG1      1.13599    0.13617   8.342  < 2e-16 ***
## MBERARBG2      0.98691    0.15747   6.267 3.68e-10 ***
## MBERARBG3      0.66523    0.20241   3.286 0.001014 ** 
## MBERARBG4      0.84202    0.25466   3.306 0.000945 ***
## MBERARBG5      1.78966    0.32328   5.536 3.09e-08 ***
## MBERARBG6      0.03182    0.42382   0.075 0.940145    
## MBERARBG7      3.51117    0.56369   6.229 4.70e-10 ***
## MBERARBG8      2.91265    0.62167   4.685 2.80e-06 ***
## MBERARBG9      4.47032    0.82846   5.396 6.82e-08 ***
## MBERARBO1     -0.14848    0.13040  -1.139 0.254845    
## MBERARBO2     -0.05052    0.14875  -0.340 0.734118    
## MBERARBO3      0.25410    0.18450   1.377 0.168451    
## MBERARBO4      0.89950    0.23547   3.820 0.000133 ***
## MBERARBO5      1.32839    0.31491   4.218 2.46e-05 ***
## MBERARBO6      0.61644    0.41215   1.496 0.134736    
## MBERARBO7      1.56012    0.61400   2.541 0.011056 *  
## MBERARBO8      4.34834    0.87024   4.997 5.83e-07 ***
## MBERARBO9    -16.14269  948.86166  -0.017 0.986426    
## MSKC1          0.62343    0.26970   2.312 0.020804 *  
## MSKC2          0.58111    0.23777   2.444 0.014524 *  
## MSKC3          0.59414    0.23462   2.532 0.011331 *  
## MSKC4          0.84329    0.24768   3.405 0.000662 ***
## MSKC5          0.91687    0.25345   3.618 0.000297 ***
## MSKC6          1.46236    0.28491   5.133 2.86e-07 ***
## MSKC7          1.40819    0.31689   4.444 8.84e-06 ***
## MSKC8          2.59305    0.40569   6.392 1.64e-10 ***
## MSKC9          0.18280    0.44003   0.415 0.677830    
## MSKD1          0.11917    0.09555   1.247 0.212333    
## MSKD2         -0.03512    0.11397  -0.308 0.757974    
## MSKD3         -0.41328    0.16551  -2.497 0.012521 *  
## MSKD4         -0.44892    0.24994  -1.796 0.072474 .  
## MSKD5         -1.70222    0.60611  -2.808 0.004978 ** 
## MSKD6        -16.10247  520.21397  -0.031 0.975307    
## MSKD7          2.98897    1.17222   2.550 0.010778 *  
## MSKD9        -14.59765 6522.63864  -0.002 0.998214    
## MHKOOP1       -0.55352    0.18145  -3.051 0.002285 ** 
## MHKOOP2       -0.86983    0.16796  -5.179 2.23e-07 ***
## MHKOOP3       -0.34876    0.16428  -2.123 0.033758 *  
## MHKOOP4       -0.42117    0.16418  -2.565 0.010309 *  
## MHKOOP5       -0.42347    0.15858  -2.670 0.007577 ** 
## MHKOOP6       -0.17152    0.15691  -1.093 0.274343    
## MHKOOP7       -0.64088    0.15196  -4.217 2.47e-05 ***
## MHKOOP8       -0.26383    0.16537  -1.595 0.110621    
## MHKOOP9       -0.19365    0.15298  -1.266 0.205552    
## MAUT11         9.66462 7969.81180   0.001 0.999032    
## MAUT12        12.79757 4371.09442   0.003 0.997664    
## MAUT13        12.43480 4371.09436   0.003 0.997730    
## MAUT14        15.52138 4371.09434   0.004 0.997167    
## MAUT15        16.09448 4371.09435   0.004 0.997062    
## MAUT16        16.14883 4371.09436   0.004 0.997052    
## MAUT17        16.45876 4371.09436   0.004 0.996996    
## MAUT18        16.57530 4371.09438   0.004 0.996974    
## MAUT19        15.84595 4371.09439   0.004 0.997108    
## MAUT21         0.21571    0.11864   1.818 0.069037 .  
## MAUT22        -0.13414    0.16256  -0.825 0.409269    
## MAUT23        -0.25076    0.25931  -0.967 0.333518    
## MAUT24         0.33153    0.34615   0.958 0.338187    
## MAUT25         0.07256    0.58927   0.123 0.901996    
## MAUT26         4.40776    1.15852   3.805 0.000142 ***
## MAUT27         2.88501 6619.67724   0.000 0.999652    
## MAUT01        -0.11959    0.14176  -0.844 0.398896    
## MAUT02         0.34426    0.16545   2.081 0.037454 *  
## MAUT03         0.76243    0.25202   3.025 0.002484 ** 
## MAUT04        -0.96359    0.35664  -2.702 0.006896 ** 
## MAUT05         0.87832    0.50280   1.747 0.080664 .  
## MAUT06         4.07294    0.89493   4.551 5.34e-06 ***
## MAUT07       -57.38433 1092.31229  -0.053 0.958103    
## MAUT08        -6.10220 6772.90905  -0.001 0.999281    
## MAUT09        -3.16188 4526.53656  -0.001 0.999443    
## MINK30451      0.11489    0.20108   0.571 0.567759    
## MINK30452      0.40932    0.15932   2.569 0.010193 *  
## MINK30453      0.22125    0.15447   1.432 0.152071    
## MINK30454      0.05235    0.15478   0.338 0.735204    
## MINK30455     -0.10527    0.16433  -0.641 0.521802    
## MINK30456     -0.06767    0.18046  -0.375 0.707673    
## MINK30457      0.17984    0.22223   0.809 0.418375    
## MINK30458      0.09146    0.40537   0.226 0.821496    
## MINK30459      1.49863    0.29286   5.117 3.10e-07 ***
## MINK75121      0.55873    0.09104   6.137 8.40e-10 ***
## MINK75122      0.20065    0.10795   1.859 0.063078 .  
## MINK75123      0.18764    0.15337   1.223 0.221142    
## MINK75124      0.66149    0.19793   3.342 0.000831 ***
## MINK75125     -0.63363    0.29297  -2.163 0.030557 *  
## MINK75126    -18.04942 1826.99560  -0.010 0.992118    
## MINK75127    -21.48448 6522.63864  -0.003 0.997372    
## MINK75128    -20.17604 2453.54957  -0.008 0.993439    
## MINK75129      3.81007    0.90977   4.188 2.81e-05 ***
## MINK123M1     -0.23754    0.10816  -2.196 0.028086 *  
## MINK123M2     -0.05399    0.24291  -0.222 0.824110    
## MINK123M3     -1.14967    0.40195  -2.860 0.004233 ** 
## MINK123M4    -18.27140 1129.29670  -0.016 0.987091    
## MINK123M5    -19.42596 6522.63862  -0.003 0.997624    
## MINK123M7    -23.06411 6522.63865  -0.004 0.997179    
## MINK123M9    -17.33042 6522.63868  -0.003 0.997880    
## MKOOPKLA2      0.42918    0.70559   0.608 0.543019    
## MKOOPKLA3    -15.74138 1366.24300  -0.012 0.990807    
## MKOOPKLA4    -14.76526 1366.24305  -0.011 0.991377    
## MKOOPKLA5    -14.14797 1366.24311  -0.010 0.991738    
## MKOOPKLA6    -11.85123 1366.24377  -0.009 0.993079    
## MKOOPKLA7    -12.64922 1366.24385  -0.009 0.992613    
## MKOOPKLA8    -12.72138 1366.24391  -0.009 0.992571    
## PPERSAUT4    -17.74738 6522.63861  -0.003 0.997829    
## PPERSAUT5     -0.23505    0.11815  -1.989 0.046648 *  
## PPERSAUT6      1.79486    0.06328  28.364  < 2e-16 ***
## PPERSAUT7    -17.69278  815.91470  -0.022 0.982700    
## PPERSAUT8    -16.21935 4248.45184  -0.004 0.996954    
## PMOTSCO3       3.60728    1.56510   2.305 0.021176 *  
## PMOTSCO4      -0.19549    0.17860  -1.095 0.273729    
## PMOTSCO5       0.12055    0.30648   0.393 0.694079    
## PMOTSCO6      -2.73265    0.53918  -5.068 4.02e-07 ***
## PMOTSCO7     -18.81346 4310.32002  -0.004 0.996517    
## PVRAAUT4     -19.00878 6522.63861  -0.003 0.997675    
## PVRAAUT6     -16.93379 2122.80799  -0.008 0.993635    
## PVRAAUT9       0.30845 6573.47178   0.000 0.999963    
## PAANHANG1      1.49536    0.45640   3.276 0.001051 ** 
## PAANHANG2      0.93465    0.37819   2.471 0.013461 *  
## PAANHANG3    -16.41691 2219.91357  -0.007 0.994099    
## PAANHANG4    -19.03218 6522.63862  -0.003 0.997672    
## PAANHANG5     -2.59521 6859.38251   0.000 0.999698    
## PWERKT2      -16.15994 2250.50380  -0.007 0.994271    
## PWERKT3      -16.41642 2024.09365  -0.008 0.993529    
## PWERKT4      -15.10514 1862.13909  -0.008 0.993528    
## PWERKT6      -13.59613 2980.08389  -0.005 0.996360    
## PWAOREG4     -19.01229 6522.63861  -0.003 0.997674    
## PWAOREG5     -15.88444 6522.63862  -0.002 0.998057    
## PWAOREG6       3.03697    0.41525   7.314 2.60e-13 ***
## PWAOREG7     -18.78709 3215.37010  -0.006 0.995338    
## PPLEZIER1      5.28855    1.58056   3.346 0.000820 ***
## PPLEZIER2      0.28372    1.46690   0.193 0.846636    
## PPLEZIER3      2.87275    1.45969   1.968 0.049062 *  
## PPLEZIER4      0.90003    1.21630   0.740 0.459318    
## PPLEZIER5    -16.16662 3841.44362  -0.004 0.996642    
## PPLEZIER6     50.42224  854.52137   0.059 0.952947    
## AWAPART1       0.45023    0.07350   6.126 9.04e-10 ***
## AWAPART2     -17.91175 2140.36143  -0.008 0.993323    
## AWALAND1      -1.62608    0.27649  -5.881 4.07e-09 ***
## ABROM1        -0.81126    0.15225  -5.328 9.90e-08 ***
## ABROM2       -17.50707 1537.26921  -0.011 0.990914    
## ALEVEN1       -0.93333    0.17662  -5.284 1.26e-07 ***
## ALEVEN2        0.06703    0.18209   0.368 0.712793    
## ALEVEN3        0.35697    0.45900   0.778 0.436738    
## ALEVEN4        2.11320    0.65493   3.227 0.001253 ** 
## ALEVEN8       -0.83679 7784.22492   0.000 0.999914    
## APERSONG1     -3.87707    0.91367  -4.243 2.20e-05 ***
## AGEZONG1       0.25353    0.30356   0.835 0.403601    
## ABRAND1        0.38349    0.07494   5.117 3.10e-07 ***
## ABRAND2        0.16098    0.19968   0.806 0.420133    
## ABRAND3      -19.01316 1746.28080  -0.011 0.991313    
## ABRAND4      -16.83111 2931.62747  -0.006 0.995419    
## ABRAND5      -12.93114 3884.72146  -0.003 0.997344    
## ABRAND7      -16.29095 6522.63861  -0.002 0.998007    
## APLEZIER1      0.99051    1.31145   0.755 0.450082    
## APLEZIER2           NA         NA      NA       NA    
## AFIETS1        1.00642    0.19752   5.095 3.48e-07 ***
## AFIETS2        0.74848    0.29038   2.578 0.009948 ** 
## AFIETS3        2.79384    1.17912   2.369 0.017816 *  
## ABYSTAND1      0.78205    0.19977   3.915 9.05e-05 ***
## ABYSTAND2    -18.64288 6522.63861  -0.003 0.997720    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15177.2  on 10947  degrees of freedom
## Residual deviance:  9775.8  on 10662  degrees of freedom
## AIC: 10348
## 
## Number of Fisher Scoring iterations: 17
drewMatrix(model.2, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2001  110
##          1 1761  128
##                                           
##                Accuracy : 0.5322          
##                  95% CI : (0.5166, 0.5478)
##     No Information Rate : 0.9405          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0164          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.53782         
##             Specificity : 0.53190         
##          Pos Pred Value : 0.06776         
##          Neg Pred Value : 0.94789         
##              Prevalence : 0.05950         
##          Detection Rate : 0.03200         
##    Detection Prevalence : 0.47225         
##       Balanced Accuracy : 0.53486         
##                                           
##        'Positive' Class : 1               
## 
# difference in deviance =  Null deviance (15177.2) - 9766.3 = 5411
# Sensitivity : 64%
# Accuracy : 54%

explanation

Model 3 with Domain knowledge

corrplot(cor(subset(df_train , select = c("PBRAND", "MOSTYPE", "PPERSAUT", "MKOOPKLA", "MHKOOP", "CARAVAN"))), method = "number", type = "upper")

train_3 = over_train
for(i in 1:ncol(train_3)){
train_3[,i] <- as.factor(train_3[,i])
}
model.3 = glm(formula = CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA+MHKOOP, family = binomial(link = "logit"), 
    data = train_3)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model.3$xlevels[["PPERSAUT"]] <- union(model.3$xlevels[["PPERSAUT"]], levels(over_test$PPERSAUT))

predicted_3 = predict(model.3, over_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
predictedClass_3 = ifelse(predicted_3>=0.5, 1, 0)

drewSummary(model.3)
## 
## Call:
## glm(formula = CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA + 
##     MHKOOP, family = binomial(link = "logit"), data = train_3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2188  -0.8846   0.2112   0.8877   2.0655  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   13.56737  542.41896   0.025 0.980045    
## PBRAND1       -0.44457    0.18805  -2.364 0.018071 *  
## PBRAND2       -1.09720    0.13286  -8.258  < 2e-16 ***
## PBRAND3        0.70914    0.06417  11.051  < 2e-16 ***
## PBRAND4        0.86543    0.05584  15.498  < 2e-16 ***
## PBRAND5        0.01437    0.14206   0.101 0.919446    
## PBRAND6       -0.71539    0.19039  -3.758 0.000172 ***
## PBRAND7      -16.09720  662.23080  -0.024 0.980607    
## PBRAND8      -16.67399 2399.54472  -0.007 0.994456    
## MOSTYPE2      -1.55692    0.49312  -3.157 0.001592 ** 
## MOSTYPE3      -1.57461    0.47086  -3.344 0.000825 ***
## MOSTYPE4      -2.27590    0.52690  -4.319 1.56e-05 ***
## MOSTYPE5       0.21351    1.10997   0.192 0.847463    
## MOSTYPE6      -0.19143    0.18977  -1.009 0.313093    
## MOSTYPE7      -1.79296    0.50819  -3.528 0.000418 ***
## MOSTYPE8      -0.21303    0.35093  -0.607 0.543824    
## MOSTYPE9      -0.91550    1.04692  -0.874 0.381859    
## MOSTYPE10     -0.55308    0.18924  -2.923 0.003471 ** 
## MOSTYPE11     -2.10475    0.46146  -4.561 5.09e-06 ***
## MOSTYPE12     -0.12282    0.35074  -0.350 0.726202    
## MOSTYPE13     -1.62817    0.46996  -3.464 0.000531 ***
## MOSTYPE15    -30.05373 1154.96824  -0.026 0.979240    
## MOSTYPE16    -30.95223  748.96156  -0.041 0.967035    
## MOSTYPE17    -17.31495  724.59387  -0.024 0.980936    
## MOSTYPE18    -30.79590  728.77089  -0.042 0.966294    
## MOSTYPE19    -16.48050 1235.29830  -0.013 0.989355    
## MOSTYPE20    -14.89542  542.41874  -0.027 0.978092    
## MOSTYPE21    -30.72381  784.93331  -0.039 0.968777    
## MOSTYPE22    -15.62265  542.41870  -0.029 0.977023    
## MOSTYPE23     -0.50915    1.08747  -0.468 0.639643    
## MOSTYPE24    -15.73582  542.41873  -0.029 0.976856    
## MOSTYPE25    -15.06992  542.41897  -0.028 0.977835    
## MOSTYPE26    -15.51812  542.41905  -0.029 0.977176    
## MOSTYPE27    -15.31559  542.41907  -0.028 0.977474    
## MOSTYPE28    -30.61014  696.61845  -0.044 0.964951    
## MOSTYPE29     -0.57512    1.10107  -0.522 0.601440    
## MOSTYPE30    -15.85993  542.41873  -0.029 0.976674    
## MOSTYPE31    -15.57445  542.41898  -0.029 0.977094    
## MOSTYPE32    -14.40755  542.41897  -0.027 0.978809    
## MOSTYPE33      0.28391    1.07957   0.263 0.792566    
## MOSTYPE34     -2.19298    0.48035  -4.565 4.99e-06 ***
## MOSTYPE35     -1.83656    1.01195  -1.815 0.069541 .  
## MOSTYPE36      0.85769    1.08415   0.791 0.428874    
## MOSTYPE37     -0.55322    1.04391  -0.530 0.596148    
## MOSTYPE38     -0.33805    1.04363  -0.324 0.745997    
## MOSTYPE39     -1.30089    1.01238  -1.285 0.198796    
## MOSTYPE40    -15.96420  253.43789  -0.063 0.949774    
## MOSTYPE41     -1.28891    1.05010  -1.227 0.219667    
## PPERSAUT4    -15.30126 2399.54472  -0.006 0.994912    
## PPERSAUT5     -0.17935    0.09533  -1.881 0.059930 .  
## PPERSAUT6      1.48920    0.04958  30.034  < 2e-16 ***
## PPERSAUT7    -15.65631  332.76882  -0.047 0.962474    
## PPERSAUT8    -15.11916 1115.20566  -0.014 0.989183    
## MKOOPKLA2      0.56639    0.52117   1.087 0.277144    
## MKOOPKLA3    -15.33692  542.41790  -0.028 0.977443    
## MKOOPKLA4    -14.09738  542.41796  -0.026 0.979265    
## MKOOPKLA5    -13.53048  542.41802  -0.025 0.980099    
## MKOOPKLA6    -12.98108  542.41876  -0.024 0.980907    
## MKOOPKLA7    -13.90142  542.41884  -0.026 0.979554    
## MKOOPKLA8    -14.67041  542.41893  -0.027 0.978423    
## MHKOOP1       -0.23423    0.11634  -2.013 0.044079 *  
## MHKOOP2       -0.17357    0.11482  -1.512 0.130606    
## MHKOOP3        0.22847    0.11719   1.950 0.051232 .  
## MHKOOP4        0.08435    0.10780   0.783 0.433912    
## MHKOOP5       -0.12754    0.10708  -1.191 0.233640    
## MHKOOP6        0.23910    0.10357   2.308 0.020972 *  
## MHKOOP7        0.03356    0.10105   0.332 0.739770    
## MHKOOP8        0.56480    0.11241   5.024 5.05e-07 ***
## MHKOOP9        0.34188    0.09857   3.468 0.000524 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15177  on 10947  degrees of freedom
## Residual deviance: 11950  on 10879  degrees of freedom
## AIC: 12088
## 
## Number of Fisher Scoring iterations: 15
drewMatrix(model.3, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2171   74
##          1 1591  164
##                                           
##                Accuracy : 0.5838          
##                  95% CI : (0.5683, 0.5991)
##     No Information Rate : 0.9405          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0668          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.68908         
##             Specificity : 0.57709         
##          Pos Pred Value : 0.09345         
##          Neg Pred Value : 0.96704         
##              Prevalence : 0.05950         
##          Detection Rate : 0.04100         
##    Detection Prevalence : 0.43875         
##       Balanced Accuracy : 0.63308         
##                                           
##        'Positive' Class : 1               
## 
getRMSE(predictedClass_3)
## [1] 0.8110179
# Area under curve
pr <- prediction(predictedClass_3,over_test$CARAVAN)
perf <- performance(pr,measure = "tpr",x.measure = "fpr")
plot(perf) > auc(over_test$CARAVAN,predictedClass_3)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## logical(0)
auc_ROCR <- performance(pr, measure = "auc")
auc_ROCR <- auc_ROCR@y.values[[1]]
auc_ROCR
## [1] 0.6330811
pR2(model.3)['McFadden']
## fitting null model for pseudo-r2
##  McFadden 
## 0.2126512
# difference in deviance =  Null deviance (15177) - 12051 = 3025
# sensivity 72%
# Accuracy : 60%

Explanation AUC roc plot logistic regression Our AUC score is 0.7176653

Evaluting performance of model 3

pred_t <- predict(model.3, na.action=na.pass)
hist(pred_t)

boxplot(pred_t)

##Plotting residual histograms for training and validation data
resid.t<-residuals(model.3)
hist(resid.t)

ROC Model 3

drewROC(model.3)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Lift chart

lift.example <- lift(relevel(as.factor(over_test$CARAVAN), ref="1") ~ predicted_3, data = over_test)
#xyplot(lift.example, plot = "gain")

Decil Wise chart

library(gains)
actual = as.numeric(over_test$CARAVAN)
predicted_3_num = as.numeric(predicted_3)
gain = gains(actual, predicted_3_num)

barplot(gain$mean.resp / mean(actual), names.arg = gain$depth, xlab = "Percentile", ylab = "Mean Response", main = "Decile-wise lift chart")

train_4 = over_train
for(i in 1:ncol(train_4)){
train_4[,i] <- as.factor(train_4[,i])
}

Decision tree.

fit1 = rpart(formula=CARAVAN ~ .,data=over_train,method = 'class', control=rpart.control(minsplit=20, minbucket=1, cp=0.008))

fit1
## n= 10948 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 10948 5474 0 (0.50000000 0.50000000)  
##     2) PPERSAUT=0,4,5,7,8 4772 1355 0 (0.71605197 0.28394803)  
##       4) MOSTYPE=3,4,6,9,10,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,34,35,40,41 2532  376 0 (0.85150079 0.14849921) *
##       5) MOSTYPE=1,2,5,7,8,11,12,13,32,36,37,38,39 2240  979 0 (0.56294643 0.43705357)  
##        10) PBRAND=1,2,5,6,7,8 214   11 0 (0.94859813 0.05140187) *
##        11) PBRAND=0,3,4 2026  968 0 (0.52221125 0.47778875)  
##          22) MGODPR=0,1,4,8 515  140 0 (0.72815534 0.27184466) *
##          23) MGODPR=2,3,5,6,7,9 1511  683 1 (0.45201853 0.54798147)  
##            46) PBRAND=0,3 1004  456 0 (0.54581673 0.45418327)  
##              92) MBERARBG=0,3,4,6,8 387   91 0 (0.76485788 0.23514212) *
##              93) MBERARBG=1,2,5,7 617  252 1 (0.40842788 0.59157212)  
##               186) MINKM30=3,4,7,9 56    0 0 (1.00000000 0.00000000) *
##               187) MINKM30=0,1,2,5,6 561  196 1 (0.34937611 0.65062389) *
##            47) PBRAND=4 507  135 1 (0.26627219 0.73372781) *
##     3) PPERSAUT=6 6176 2057 1 (0.33306347 0.66693653)  
##       6) PBRAND=0,1,2,6,7 2287 1131 1 (0.49453432 0.50546568)  
##        12) MOSTYPE=5,7,9,16,17,18,19,21,22,23,26,28,29,31,32,40,41 419   65 0 (0.84486874 0.15513126) *
##        13) MOSTYPE=1,2,3,4,6,8,10,11,12,13,20,24,25,27,30,33,34,35,36,37,38,39 1868  777 1 (0.41595289 0.58404711)  
##          26) MSKA=0,2,8,9 783  345 0 (0.55938697 0.44061303)  
##            52) MOSTYPE=2,4,6,13,20,25,27,30,33,34,35,39 361   89 0 (0.75346260 0.24653740) *
##            53) MOSTYPE=1,3,8,10,11,12,24,36,37,38 422  166 1 (0.39336493 0.60663507)  
##             106) MGODGE=0,1,3,6,7 93   14 0 (0.84946237 0.15053763) *
##             107) MGODGE=2,4,5 329   87 1 (0.26443769 0.73556231) *
##          27) MSKA=1,3,4,5,6,7 1085  339 1 (0.31244240 0.68755760)  
##            54) MBERARBO=2,5,6,7 242   96 0 (0.60330579 0.39669421)  
##             108) MOSTYPE=1,2,3,4,6,8,10,11,13,20,24,27,30,33,36,38 130   15 0 (0.88461538 0.11538462) *
##             109) MOSTYPE=12,25,34,35,37,39 112   31 1 (0.27678571 0.72321429) *
##            55) MBERARBO=0,1,3,4,8 843  193 1 (0.22894425 0.77105575) *
##       7) PBRAND=3,4,5 3889  926 1 (0.23810748 0.76189252) *
printcp(fit1)
## 
## Classification tree:
## rpart(formula = CARAVAN ~ ., data = over_train, method = "class", 
##     control = rpart.control(minsplit = 20, minbucket = 1, cp = 0.008))
## 
## Variables actually used in tree construction:
## [1] MBERARBG MBERARBO MGODGE   MGODPR   MINKM30  MOSTYPE  MSKA     PBRAND  
## [9] PPERSAUT
## 
## Root node error: 5474/10948 = 0.5
## 
## n= 10948 
## 
##          CP nsplit rel error  xerror      xstd
## 1 0.3766898      0   1.00000 1.02320 0.0095547
## 2 0.0263975      1   0.62331 0.62349 0.0088540
## 3 0.0169894      3   0.57052 0.57526 0.0086523
## 4 0.0164414      4   0.55353 0.56284 0.0085956
## 5 0.0118743      5   0.53708 0.54786 0.0085246
## 6 0.0091341      6   0.52521 0.52576 0.0084141
## 7 0.0088296      8   0.50694 0.47497 0.0081340
## 8 0.0080000     14   0.43277 0.45177 0.0079930
fancyRpartPlot(fit1)

glm_6 = glm(formula = CARAVAN ~  MINKGEM+PBRAND+PPERSAUT, family = binomial(link = "logit"),data = train_4)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_6$xlevels[["PPERSAUT"]] <- union(glm_6$xlevels[["PPERSAUT"]], levels(over_test$PPERSAUT))
summary(glm_6)
## 
## Call:
## glm(formula = CARAVAN ~ MINKGEM + PBRAND + PPERSAUT, family = binomial(link = "logit"), 
##     data = train_4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1010  -0.8672   0.2414   0.9333   2.1897  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -16.54726  264.87882  -0.062   0.9502    
## MINKGEM1      14.09381  264.87905   0.053   0.9576    
## MINKGEM2      14.88888  264.87883   0.056   0.9552    
## MINKGEM3      14.92099  264.87882   0.056   0.9551    
## MINKGEM4      15.71555  264.87882   0.059   0.9527    
## MINKGEM5      15.76304  264.87883   0.060   0.9525    
## MINKGEM6      15.33140  264.87883   0.058   0.9538    
## MINKGEM7      16.27836  264.87885   0.061   0.9510    
## MINKGEM8      16.04392  264.87888   0.061   0.9517    
## MINKGEM9       0.09645  388.01649   0.000   0.9998    
## PBRAND1       -0.47576    0.17996  -2.644   0.0082 ** 
## PBRAND2       -1.23404    0.12810  -9.633  < 2e-16 ***
## PBRAND3        0.64429    0.06098  10.566  < 2e-16 ***
## PBRAND4        0.94664    0.05322  17.788  < 2e-16 ***
## PBRAND5       -0.02308    0.13423  -0.172   0.8635    
## PBRAND6       -1.07605    0.18146  -5.930 3.03e-09 ***
## PBRAND7      -14.87397  406.05762  -0.037   0.9708    
## PBRAND8      -14.73435 1455.39753  -0.010   0.9919    
## PPERSAUT4    -14.73435 1455.39753  -0.010   0.9919    
## PPERSAUT5     -0.20002    0.09303  -2.150   0.0315 *  
## PPERSAUT6      1.41281    0.04754  29.718  < 2e-16 ***
## PPERSAUT7    -14.57002  204.32470  -0.071   0.9432    
## PPERSAUT8    -14.49846  672.22843  -0.022   0.9828    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15177  on 10947  degrees of freedom
## Residual deviance: 12260  on 10925  degrees of freedom
## AIC: 12306
## 
## Number of Fisher Scoring iterations: 14
predicted_6 = predict(glm_6, over_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
predictedClass_6 = ifelse(predicted_6>=0.5, 1, 0)


confusionMatrix(as.factor(predictedClass_6), as.factor(over_test$CARAVAN), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2595   86
##          1 1167  152
##                                           
##                Accuracy : 0.6868          
##                  95% CI : (0.6721, 0.7011)
##     No Information Rate : 0.9405          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.105           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6387          
##             Specificity : 0.6898          
##          Pos Pred Value : 0.1152          
##          Neg Pred Value : 0.9679          
##              Prevalence : 0.0595          
##          Detection Rate : 0.0380          
##    Detection Prevalence : 0.3297          
##       Balanced Accuracy : 0.6642          
##                                           
##        'Positive' Class : 1               
## 
accuracy(predictedClass_6, as.numeric(over_test$CARAVAN))
##               ME     RMSE     MAE    MPE   MAPE
## Test set 0.72975 0.879062 0.72975 68.925 68.925
pR2(glm_6)['McFadden']
## fitting null model for pseudo-r2
##  McFadden 
## 0.1922143
# difference in deviance =  Null deviance (15177) - 12069 = 3108
# Accuracy 68%
# Sensitivity 69%

ROC model 4

drewROC(glm_6)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Model 1 and 2 comparison

anova(logit.reg,model.2,test = 'Chisq')
## Analysis of Deviance Table
## 
## Model 1: CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c + MGEMLEEF_c
## Model 2: CARAVAN ~ MOSTYPE + MGODRK + MGODPR + MGODOV + MRELGE + MRELSA + 
##     MOPLMIDD + MOPLLAAG + MBERHOOG + MBERZELF + MBERBOER + MBERMIDD + 
##     MBERARBG + MBERARBO + MSKC + MSKD + MHKOOP + MAUT1 + MAUT2 + 
##     MAUT0 + MINK3045 + MINK7512 + MINK123M + MKOOPKLA + PPERSAUT + 
##     PMOTSCO + PVRAAUT + PAANHANG + PWERKT + PWAOREG + PPLEZIER + 
##     AWAPART + AWALAND + ABROM + ALEVEN + APERSONG + AGEZONG + 
##     ABRAND + APLEZIER + AFIETS + ABYSTAND
##   Resid. Df Resid. Dev  Df Deviance  Pr(>Chi)    
## 1     10931    14375.5                           
## 2     10662     9775.8 269   4599.7 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model 3 and 4 comparison

anova(model.3,glm_6,test = 'Chisq')
## Analysis of Deviance Table
## 
## Model 1: CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA + MHKOOP
## Model 2: CARAVAN ~ MINKGEM + PBRAND + PPERSAUT
##   Resid. Df Resid. Dev  Df Deviance  Pr(>Chi)    
## 1     10879      11950                           
## 2     10925      12260 -46  -310.17 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1